# -*- coding: utf-8 -*-

# # Save data
# import cPickle
#
# import itertools
#
# # Process event date/time strings
# import datetime
#
# import numpy as np
# import scipy.io as sio
# import scipy.sparse as ss
#
# # Similarity / distance
# import scipy.spatial.distance as ssd
#
# from collections import defaultdict
# from sklearn.preprocessing import normalize
import pandas as pd

def collectEventIds(filename):
    """Collect the distinct event IDs listed in a CSV file.

    The first line of the file is treated as a header and skipped. Every
    remaining line is stripped, split on commas, and its second field is
    taken as the event ID.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A set of strings: the distinct values found in the second column.

    Raises:
        IndexError: If a data line has fewer than two comma-separated fields
            (this includes blank lines).
    """
    events = set()
    # Text mode keeps every line a ``str`` so it can be split on "," under
    # both Python 2 and Python 3; ``with`` closes the file even when a
    # malformed row raises part-way through.
    with open(filename, 'r') as f:
        f.readline()  # skip the header row (column names)
        for line in f:  # one record per line
            cols = line.strip().split(",")
            events.add(cols[1])  # the second column holds the event ID
    return events

# Directory holding the raw competition CSV files.
DATA_DIR = "/Users/wuzhong/gitee/homework4/"

# Event IDs referenced by the train / test splits; collectEventIds returns
# them as a set of strings.
train_event_ids = collectEventIds(DATA_DIR + "train.csv")
test_events_ids = collectEventIds(DATA_DIR + "test.csv")

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(train_event_ids)

# Load events.csv, then save the rows that train/test refer to into the
# current working directory.
events = pd.read_csv(DATA_DIR + "events.csv")

# The collected IDs are strings, while read_csv will normally infer a numeric
# dtype for an ID column; compare on the string form so isin() can match.
# NOTE(review): assumes event_id values round-trip through str() unchanged
# (plain integer IDs) -- confirm against the data.
event_id_text = events['event_id'].astype(str)

train_events = events.loc[event_id_text.isin(train_event_ids)]
print("train_events.info()")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being printed (which would emit a stray "None").
train_events.info()
train_events.to_csv("train_events.csv")
print("saved to train_events.csv")

test_events = events.loc[event_id_text.isin(test_events_ids)]
print("test_events.info()")
test_events.info()
test_events.to_csv("test_events.csv")
print("saved to test_events.csv")

events.info()

